/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fusesource.hawtdb.internal.page;

import org.fusesource.hawtbuf.Buffer;
import org.fusesource.hawtbuf.DataByteArrayInputStream;
import org.fusesource.hawtbuf.DataByteArrayOutputStream;
import org.fusesource.hawtdb.api.*;
import org.fusesource.hawtdb.api.Paged.SliceType;
import org.fusesource.hawtdb.internal.io.MemoryMappedFile;
import org.fusesource.hawtdb.internal.util.Ranges;
import org.fusesource.hawtdb.util.list.LinkedNodeList;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.*;
import java.util.zip.CRC32;

import static org.fusesource.hawtdb.internal.page.Logging.*;

/**
 * Provides concurrent page file access via multiversion concurrency control
 * (MVCC).
 *
 * Once a transaction begins working against the data, it acquires a snapshot
 * of all the data in the page file. This snapshot is used to provide the
 * transaction a consistent view of the data in spite of it being concurrently
 * modified by other transactions.
 *
 * When a transaction does a page update, the update is stored in a temporary
 * page location. Subsequent reads of the original page will result in a read
 * of the temporary page. If the transaction rolls back, the temporary pages
 * are freed. If the transaction commits, the page updates are assigned the
 * next snapshot version number and the update gets queued so that it can be
 * applied atomically at a later time.
 *
 * (A usage sketch appears near the {@code tx()} method below.)
 *
 * @author <a href="http://hiramchirino.com">Hiram Chirino</a>
 */
public final class HawtTxPageFile implements TxPageFile {

    public static final int FILE_HEADER_SIZE = 1024 * 4;
    public static final byte[] MAGIC = magic();

    private static byte[] magic() {
        try {
            byte rc[] = new byte[32];
            byte[] tmp = "HawtDB:1.0\n".getBytes("UTF-8");
            System.arraycopy(tmp, 0, rc, 0, tmp.length);
            return rc;
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * The first 4K of the file is used to hold 2 copies of the header.
     * Each copy is 2K big. The header is checksummed so that corruption
     * can be detected.
     */
    static private class Header {

        /** Identifies the file format */
        public volatile byte[] magic = new byte[32];
        /** The oldest applied commit revision */
        public volatile long base_revision;
        /** The size of each page in the page file */
        public volatile int page_size;
        /** The page location of the free page list */
        public volatile int free_list_page;
        /** Where it is safe to resume recovery... Will be
         *  -1 if no recovery is needed. */
        public volatile int pessimistic_recovery_page;
        /** We try to recover from this point, but it may fail since its
         *  writes may not have been synced to disk. */
        public volatile int optimistic_recovery_page;
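        // An illustrative view of the on-disk layout produced by encode() and
        // verified by decode() below (offsets assume FILE_HEADER_SIZE = 4096):
        //
        //   [0                      .. length-1                   ]  copy #1 fields
        //   [FILE_HEADER_SIZE/2 - 8 .. FILE_HEADER_SIZE/2 - 1     ]  copy #1 CRC32 (long)
        //   [FILE_HEADER_SIZE/2     .. FILE_HEADER_SIZE/2+length-1]  copy #2 fields
        //   [FILE_HEADER_SIZE - 8   .. FILE_HEADER_SIZE - 1       ]  copy #2 CRC32 (long)
        //
        // decode() verifies copy #1 first and falls back to copy #2, so a torn
        // write of one copy never loses the header.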
        public String toString() {
            return "{ base_revision: " + this.base_revision +
                   ", page_size: " + page_size +
                   ", free_list_page: " + free_list_page +
                   ", pessimistic_recovery_page: " + pessimistic_recovery_page +
                   ", optimistic_recovery_page: " + optimistic_recovery_page +
                   " }";
        }

        private final DataByteArrayOutputStream os = new DataByteArrayOutputStream(FILE_HEADER_SIZE);

        Buffer encode() {
            try {
                os.reset();
                os.write(magic);
                os.writeLong(base_revision);
                os.writeInt(page_size);
                os.writeInt(free_list_page);
                os.writeInt(pessimistic_recovery_page);
                os.writeInt(optimistic_recovery_page);

                int length = os.position();
                byte[] data = os.getData();

                CRC32 checksum = new CRC32();
                checksum.update(data, 0, length);

                os.position((FILE_HEADER_SIZE / 2) - 8);
                os.writeLong(checksum.getValue());
                System.arraycopy(data, 0, data, FILE_HEADER_SIZE / 2, length);
                // The 2nd copy's checksum lives at the end of the header area;
                // decode() reads it back from FILE_HEADER_SIZE - 8.
                os.position(FILE_HEADER_SIZE - 8);
                os.writeLong(checksum.getValue());
                return os.toBuffer();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        void decode(Buffer buffer) throws PagingException {
            DataByteArrayInputStream is = new DataByteArrayInputStream(buffer);
            int length = readFields(is);
            is.setPos((FILE_HEADER_SIZE / 2) - 8);
            long expectedChecksum = is.readLong();
            CRC32 checksum = new CRC32();
            checksum.update(buffer.data, 0, length);
            if (checksum.getValue() != expectedChecksum) {
                // Try the 2nd copy..
                is.setPos(FILE_HEADER_SIZE / 2);
                length = readFields(is);
                is.setPos(FILE_HEADER_SIZE - 8);
                expectedChecksum = is.readLong();
                checksum = new CRC32();
                // Checksum only the 2nd copy's fields; they start at
                // FILE_HEADER_SIZE / 2.
                checksum.update(buffer.data, FILE_HEADER_SIZE / 2, length - (FILE_HEADER_SIZE / 2));
                if (checksum.getValue() != expectedChecksum) {
                    throw new PagingException("file header corruption detected.");
                }
            }
        }

        private int readFields(DataByteArrayInputStream is) {
            is.readFully(magic);
            base_revision = is.readLong();
            page_size = is.readInt();
            free_list_page = is.readInt();
            pessimistic_recovery_page = is.readInt();
            optimistic_recovery_page = is.readInt();
            int length = is.getPos();
            return length;
        }
    }

    /** The header structure of the file */
    private final Header header = new Header();

    private final LinkedNodeList<Batch> batches = new LinkedNodeList<Batch>();
    private final MemoryMappedFile file;
    final Allocator allocator;
    final HawtPageFile pageFile;
    private static final int updateBatchSize = 1024;
    private final boolean synch;
    private volatile int lastBatchPage = -1;

    //
    // The following batch objects point to linked nodes in the previous batch list.
    // They are used to track/designate the state of the batch object.
    //

    /** The current batch that is being assembled. */
    volatile Batch openBatch;
    /** The batches that are being stored... These might be recoverable. */
    volatile Batch storingBatches;
    /** The stored batches. */
    volatile Batch storedBatches;
    /** The performed batches. Page updates have been copied from the redo
     *  pages to the original page locations. */
    volatile Batch performedBatches;

    /** A read cache used to speed up access to frequently used pages */
    volatile ReadCache readCache;
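    // Illustrative: the four pointers above partition the `batches` linked
    // list (ordered oldest to youngest) into contiguous state segments:
    //
    //   [performedBatches .. storedBatches)  : performed, awaiting release
    //   [storedBatches    .. storingBatches) : stored (synced to disk)
    //   [storingBatches   .. openBatch)      : storing (written, not yet synced)
    //   [openBatch]                          : open, still accepting commits
    //
    // A state transition just advances one pointer to the next node, which is
    // why it can be done while holding TRANSACTION_MUTEX only briefly.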
    //
    // Profilers like YourKit only report which mutex class was locked, so we
    // create a distinct class for each mutex to more easily tell which one
    // was contended.
    //
    private static class HOUSE_KEEPING_MUTEX {
        public String toString() {
            return "HOUSE_KEEPING_MUTEX";
        }
    }

    private static class TRANSACTION_MUTEX {
        public String toString() {
            return "TRANSACTION_MUTEX";
        }
    }

    /**
     * Mutex for data structures which are used during house keeping tasks like batch
     * management. Once acquired, you can also acquire the TRANSACTION_MUTEX.
     */
    private final HOUSE_KEEPING_MUTEX HOUSE_KEEPING_MUTEX = new HOUSE_KEEPING_MUTEX();

    /**
     * Mutex for data structures which transaction threads access. Never attempt to
     * acquire the HOUSE_KEEPING_MUTEX once this mutex is acquired.
     */
    final TRANSACTION_MUTEX TRANSACTION_MUTEX = new TRANSACTION_MUTEX();

    /**
     * This is the free page list at the base revision. It does not
     * track allocations in transactions or committed updates. Only
     * when the updates are performed will this list be updated.
     *
     * The main purpose of this list is to initialize the free list
     * on recovery.
     *
     * This does not track the space associated with batch lists
     * and free lists. On recovery that space is discovered and
     * tracked in the page file allocator.
     */
    private Ranges storedFreeList = new Ranges();

    private final ExecutorService worker;

    public HawtTxPageFile(TxPageFileFactory factory, HawtPageFile pageFile) {
        this.pageFile = pageFile;
        this.synch = factory.isSync();
        this.file = pageFile.getFile();
        this.allocator = pageFile.allocator();
        this.readCache = new ReadCache(pageFile, factory.getPageCache());

        if (factory.isUseWorkerThread()) {
            worker = Executors.newSingleThreadExecutor(new ThreadFactory() {
                public Thread newThread(Runnable r) {
                    Thread rc = new Thread(r);
                    rc.setName("HawtDB Worker");
                    rc.setDaemon(true);
                    return rc;
                }
            });
        } else {
            worker = null;
        }
    }

    public ReadCache readCache() {
        return readCache;
    }

    public void close() {
        if (worker != null) {
            final CountDownLatch done = new CountDownLatch(1);
            worker.execute(new Runnable() {
                public void run() {
                    done.countDown();
                    worker.shutdownNow();
                }
            });
            try {
                done.await();
            } catch (InterruptedException e) {
            }
        }
        flush();
        performBatches();
    }

    @Override
    public String toString() {
        return "{\n" +
               " allocator: " + allocator + ",\n" +
               " synch: " + synch + ",\n" +
               " read cache size: " + readCache.cache().size() + ",\n" +
               " base revision free pages: " + storedFreeList + ",\n" +
               " batches: {\n" +
               "   performed: " + toString(performedBatches, storedBatches) + ",\n" +
               "   stored: " + toString(storedBatches, storingBatches) + ",\n" +
               "   storing: " + toString(storingBatches, openBatch) + ",\n" +
               "   open: " + toString(openBatch, null) + ",\n" +
               " }" + "\n" +
               "}";
    }

    /**
     * @param from
     * @param to
     * @return string representation of the batch items from the specified
     *         batch up to (exclusive) the specified batch.
     */
    private String toString(Batch from, Batch to) {
        StringBuilder rc = new StringBuilder();
        rc.append("[ ");
        Batch t = from;
        while (t != null && t != to) {
            if (t != from) {
                rc.append(", ");
            }
            rc.append(t);
            t = t.getNext();
        }
        rc.append(" ]");
        return rc.toString();
    }

    /* (non-Javadoc)
     * @see org.fusesource.hawtdb.internal.page.TransactionalPageFile#tx()
     */
    public Transaction tx() {
        return new HawtTransaction(this);
    }
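    // A usage sketch (not code from this class): callers obtain transactions
    // from tx() and retry on OptimisticUpdateException, which commit() below
    // raises via the snapshot tracker's commitCheck when a conflicting commit
    // landed after the transaction's snapshot. The factory setup shown is an
    // assumption based on the public TxPageFileFactory API; the file name is
    // illustrative.
    //
    // <pre>{@code
    // TxPageFileFactory factory = new TxPageFileFactory();
    // factory.setFile(new File("example.db"));
    // factory.open();
    // TxPageFile pageFile = factory.getTxPageFile();
    //
    // while (true) {
    //     Transaction tx = pageFile.tx();
    //     try {
    //         // ... read and update pages through the Transaction here ...
    //         tx.commit();
    //         break;
    //     } catch (OptimisticUpdateException expected) {
    //         tx.rollback(); // conflicting concurrent update: retry
    //     }
    // }
    // factory.close();
    // }</pre>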
    /**
     * Attempts to commit a set of page updates.
     *
     * @param snapshot
     * @param pageUpdates
     * @param flushCallbacks
     */
    void commit(Snapshot snapshot, ConcurrentHashMap<Integer, Update> pageUpdates, ArrayList<Runnable> flushCallbacks) {

        boolean fullBatch = false;
        Commit commit = null;
        synchronized (TRANSACTION_MUTEX) {

            // we need to figure out the revision id of this commit...
            long rev;
            if (snapshot != null) {

                // Let's check for an OptimisticUpdateException: verify that the
                // new commit's updates don't conflict with a commit that occurred
                // subsequent to the snapshot that this commit started operating on.
                //
                // Note: every deferred update has an entry in the pageUpdates, so
                // no need to check whether that map also conflicts.
                rev = snapshot.getTracker().commitCheck(pageUpdates);
                snapshot.close();
            } else {
                rev = openBatch.head;
            }
            rev++;

            if (flushCallbacks != null) {
                openBatch.flushCallbacks.addAll(flushCallbacks);
            }

            commit = openBatch.commits.getTail();
            if (commit != null && commit.snapshotTracker == null) {
                // Just merge with the previous commit if it does not have an
                // open snapshot.
                // TODO: we are inside the TRANSACTION_MUTEX and this seems CPU
                // intensive, but it's better than always creating more commit
                // entries, as that slows down page lookup (they have to iterate
                // through all the commits).
                commit.merge(pageFile.allocator(), rev, pageUpdates);
            } else {
                commit = new Commit(rev, pageUpdates);
                openBatch.commits.addLast(commit);
            }

            if (openBatch.base == -1) {
                openBatch.base = rev;
            }
            openBatch.head = rev;

            if (openBatch.pageCount() > updateBatchSize) {
                fullBatch = true;
            }
        }

        if (fullBatch) {
            trace("batch full.");
            synchronized (HOUSE_KEEPING_MUTEX) {
                storeBatches(false);
            }

            if (worker != null) {
                worker.execute(new Runnable() {
                    public void run() {
                        synchronized (HOUSE_KEEPING_MUTEX) {
                            syncBatches();
                        }
                    }
                });
            } else {
                synchronized (HOUSE_KEEPING_MUTEX) {
                    syncBatches();
                }
            }
        }
    }

    /**
     * Used to initialize a new file or to clear out the
     * contents of an existing file.
     */
    public void reset() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            batches.clear();
            performedBatches = storedBatches = storingBatches = openBatch = new Batch(-1);
            batches.addFirst(openBatch);
            lastBatchPage = -1;
            readCache.cache().clear();

            allocator.clear();
            storedFreeList.clear();
            storedFreeList.add(0, allocator.getLimit());

            // Initialize the file header..
            System.arraycopy(MAGIC, 0, header.magic, 0, MAGIC.length);
            header.base_revision = -1;
            header.free_list_page = -1;
            header.page_size = pageFile.getPageSize();
            header.pessimistic_recovery_page = -1;
            header.optimistic_recovery_page = -1;
            storeHeader();
        }
    }
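    // Illustrative startup sequence (an assumption about how the factory is
    // expected to drive this class, not code found in this file):
    //
    //   if (the file already contains data) {
    //       txPageFile.recover();  // replay batch logs into a consistent state
    //   } else {
    //       txPageFile.reset();    // write a fresh header and free list
    //   }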
    /**
     * Loads an existing file and replays the batch
     * logs to put it in a consistent state.
     */
    public void recover() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            batches.clear();
            performedBatches = storedBatches = storingBatches = openBatch = new Batch(-1);
            batches.addFirst(openBatch);
            lastBatchPage = -1;
            readCache.cache().clear();

            Buffer buffer = new Buffer(FILE_HEADER_SIZE);
            file.read(0, buffer);
            header.decode(buffer);
            if (!Arrays.equals(MAGIC, header.magic)) {
                throw new PagingException("The file header is not of the expected type.");
            }

            trace("recovery started. header: %s", header);

            // Initialize the free page list.
            if (header.free_list_page >= 0) {
                storedFreeList = loadObject(header.free_list_page);
                trace("loaded free page list: %s ", storedFreeList);
                allocator.setFreeRanges(storedFreeList);
                Extent.unfree(pageFile, header.free_list_page);
            } else {
                allocator.clear();
                storedFreeList.add(0, allocator.getLimit());
            }

            int pageId = header.pessimistic_recovery_page;
            if (header.optimistic_recovery_page >= 0) {
                pageId = header.optimistic_recovery_page;
            }

            LinkedList<Batch> loaded = new LinkedList<Batch>();
            boolean consistencyCheckNeeded = true;
            while (pageId >= 0) {
                trace("loading batch at: %d", pageId);

                Batch batch = null;
                if (pageId == header.pessimistic_recovery_page) {
                    consistencyCheckNeeded = false;
                }
                if (consistencyCheckNeeded) {
                    // The write could be corrupted.. let's be careful.
                    try {
                        batch = loadObject(pageId);
                    } catch (Exception e) {
                        trace("incomplete batch at: %d", pageId);
                        // Clear out any previously loaded batches and resume
                        // from the pessimistic location.
                        loaded.clear();
                        pageId = header.pessimistic_recovery_page;
                        continue;
                    }
                } else {
                    // it should load fine..
                    batch = loadObject(pageId);
                }

                batch.page = pageId;
                batch.recovered = true;
                loaded.add(batch);
                trace("loaded batch: %s", batch);

                // is this the last batch we need to load?
                if (header.base_revision + 1 == batch.base) {
                    break;
                }
                pageId = batch.previous;
            }

            if (loaded.isEmpty()) {
                trace("no batches need to be recovered.");
            } else {
                // Link up the batch objects...
                for (Batch batch : loaded) {
                    // Make sure the batch pages are not in the free list.
                    Extent.unfree(pageFile, batch.page);

                    if (openBatch.head == -1) {
                        openBatch.head = batch.head;
                    }

                    // Add first: we iterate the loaded batches youngest to
                    // oldest, but want the list ordered oldest to youngest.
                    batches.addFirst(batch);
                    performedBatches = storedBatches = batch;
                }

                // Perform the updates..
                performBatches();
                syncBatches();
            }
        }
    }

    /* (non-Javadoc)
     * @see org.fusesource.hawtdb.internal.page.TransactionalPageFile#flush()
     */
    public void flush() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            storeBatches(true);
            syncBatches();
        }
    }

    public void flush(final Runnable onComplete) {
        if (worker != null) {
            worker.execute(new Runnable() {
                public void run() {
                    flush();
                    onComplete.run();
                }
            });
        } else {
            flush();
            onComplete.run();
        }
    }

    // /////////////////////////////////////////////////////////////////
    //
    // Methods which transition batches through their life cycle states:
    //
    //   open -> storing -> stored -> performing -> performed -> released
    //
    // state: open - you can add additional commits to the batch
    //
    //   on: batch size limit reached
    //   action: write the batch to disk
    //           update optimistic_recovery_page
    //
    // state: storing - batch was written to disk, but not synced.. batch may
    //                  be lost on failure.
    //
    //   on: disk sync
    //   action: update pessimistic_recovery_page
    //
    // state: stored - we now know the batch can be recovered. Updates will
    //                 not be lost once we hit this state.
    //
    //   on: original pages drained of open snapshots
    //   action: copy shadow pages to original pages
    //
    // state: performing - original pages are being updated. Updates might be
    //                     partially applied.
    //
    //   on: disk sync
    //
    // state: performed - original pages now updated.
    //
    //   action: the batch becomes the base revision, new snapshots can refer
    //           to the original page locations.
    //
    //   on: batch drained of open snapshots
    //
    // state: released - the batch is no longer being used.
    //
    //   action: free the batch shadow pages
    //
    // /////////////////////////////////////////////////////////////////
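    // Illustrative timeline for a single batch B as it moves through the
    // states above (a sketch matching the methods that follow):
    //
    //   commit()          fills B                B: open
    //   storeBatches()    writes B to disk       B: storing
    //                     header.optimistic_recovery_page = B.page
    //   syncBatches()     file.sync()            B: stored
    //                     header.pessimistic_recovery_page = B.page
    //   performBatches()  copies shadow pages    B: performed
    //   syncBatches()     (a later sync)         B: released
    //                     B's pages freed, header.base_revision = B.head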
    /**
     * Attempts to perform a batch state change: open -> storing
     */
    private void storeBatches(boolean force) {
        Batch batch;

        // We synchronize /w the transactions so that they see the state change.
        synchronized (TRANSACTION_MUTEX) {
            // Re-checking since storing the batch may not be needed.
            if ((force && openBatch.base != -1) || openBatch.pageCount() > updateBatchSize) {
                batch = openBatch;
                openBatch = new Batch(batch.head);
                batches.addLast(openBatch);
            } else {
                return;
            }
        }

        // Write any outstanding deferred cache updates...
        batch.performDeferredUpdates(pageFile);

        // Link it to the last batch.
        batch.previous = lastBatchPage;

        // Store the batch record.
        lastBatchPage = batch.page = storeObject(batch);
        trace("stored batch: %s", batch);

        // Update the header to know about the new batch page.
        header.optimistic_recovery_page = batch.page;
        storeHeader();
    }

    /**
     * Performs a file sync.
     *
     * This allows two types of batch state changes to occur:
     * <ul>
     * <li> storing -> stored
     * <li> performed -> released
     * </ul>
     */
    private void syncBatches() {

        // This is a slow operation..
        if (synch) {
            file.sync();
        }

        // Update the base_revision with the last performed revision.
        if (performedBatches != storedBatches) {
            Batch lastPerformedBatch = storedBatches.getPrevious();
            header.base_revision = lastPerformedBatch.head;
        }

        // Were there some batches in the storing state?
        if (storingBatches != openBatch) {

            // Callback the runnables which were waiting for the updates to
            // be fully flushed to disk.
            Batch cur = storingBatches;
            while (cur != openBatch) {
                // Note: iterate cur's callbacks (not storingBatches') so each
                // batch in the range gets its own callbacks run.
                for (Runnable runnable : cur.flushCallbacks) {
                    try {
                        runnable.run();
                    } catch (Throwable e) {
                        e.printStackTrace();
                    }
                }
                cur = cur.getNext();
            }

            // The last stored batch is actually synced now..
            Batch lastStoredBatch = openBatch.getPrevious();
            // Let the header know about it..
            header.pessimistic_recovery_page = lastStoredBatch.page;
            if (header.optimistic_recovery_page == header.pessimistic_recovery_page) {
                header.optimistic_recovery_page = -1;
            }

            // We synchronize /w the transactions so that they see the state change.
            synchronized (TRANSACTION_MUTEX) {
                // Transition storing -> stored.
                storingBatches = openBatch;
            }
        }

        // Apply any batches that can be applied..
        performBatches();

        // Once a batch has been performed, subsequently synced, and no longer
        // referenced, its allocated recovery space can be released.
        while (performedBatches != storedBatches) {
            if (performedBatches.snapshots != 0) {
                break;
            }
            if (performedBatches.page == header.pessimistic_recovery_page) {
                header.pessimistic_recovery_page = -1;
            }

            // Free the update pages associated with the batch.
            performedBatches.release(allocator);

            // Free the batch record itself.
            Extent.free(pageFile, performedBatches.page);

            // No need to sync /w transactions since they don't use the
            // performedBatches variable.
            // Transition performed -> released.
            performedBatches = performedBatches.getNext();

            // Remove the released batch from the batch list.
            performedBatches.getPrevious().unlink();
        }

        // Store the free list..
        int previousFreeListPage = header.free_list_page;
        header.free_list_page = storeObject(storedFreeList);
        storeHeader();

        // Release the previous free list.
        if (previousFreeListPage >= 0) {
            Extent.free(pageFile, previousFreeListPage);
        }
    }
    /**
     * Attempts to perform a batch state change: stored -> performed
     *
     * Once a batch is performed, new snapshots will not reference
     * the batch anymore.
     */
    public void performBatches() {

        if (storedBatches == storingBatches) {
            // There are no batches in the stored state for us to transition.
            return;
        }

        // The last performed batch MIGHT still have an open snapshot.
        // We can't transition from stored until that snapshot closes.
        Batch lastPerformed = storedBatches.getPrevious();
        if (lastPerformed != null && lastPerformed.snapshots != 0) {
            return;
        }

        while (storedBatches != storingBatches) {
            trace("Performing batch: %s", storedBatches);

            // Performing the batch actually applies the updates to the
            // original page locations.
            for (Commit commit : storedBatches) {
                for (Entry<Integer, Update> entry : commit.updates.entrySet()) {
                    int page = entry.getKey();
                    Update update = entry.getValue();

                    if (traced(page) || (update.shadowed() && traced(update.shadow()))) {
                        trace("performing update at %d %s", page, update);
                    }

                    // is it a shadow update?
                    if (update.shadowed()) {

                        if (storedBatches.recovered) {
                            // If we are recovering, the allocator MIGHT not have
                            // the shadow page marked as allocated. This makes sure
                            // it's allocated so that a new transaction does not
                            // get this page and overwrite it in error.
                            allocator.unfree(update.shadow(), 1);
                        }

                        // Perform the update by copying the updated page to the
                        // original page location.
                        if (traced(page) || traced(update.shadow())) {
                            trace("performing shadow update on %d from %d", page, update.shadow());
                        }
                        ByteBuffer slice = pageFile.slice(SliceType.READ, update.shadow(), 1);
                        try {
                            pageFile.write(page, slice);
                        } finally {
                            pageFile.unslice(slice);
                        }
                    }

                    if (update.allocated()) {
                        if (storedBatches.recovered) {
                            // If we are recovering, the allocator MIGHT not have
                            // this page marked as allocated. This makes sure it's
                            // allocated so that a new transaction does not get
                            // this page and overwrite it in error.
                            allocator.unfree(page, 1);
                        }
                        // Update the persistent free list. This gets stored on
                        // the next sync.
                        storedFreeList.remove(page, 1);
                    } else if (update.freed()) {
                        storedFreeList.add(page, 1);
                    }

                    // update the read cache..
                    DeferredUpdate du = update.deferredUpdate();
                    if (du != null) {
                        if (du.removed()) {
                            readCache.cache().remove(page);
                        } else if (du.put()) {
                            readCache.cache().put(page, du.value);
                        }
                    }
                }
            }
            storedBatches.performed = true;

            // We synchronize /w the transactions so that they see the state change.
            synchronized (TRANSACTION_MUTEX) {
                // Transition stored -> performed.
                storedBatches = storedBatches.getNext();
            }

            lastPerformed = storedBatches.getPrevious();
            // We have to stop if the last batch performed has an open snapshot.
            if (lastPerformed.snapshots != 0) {
                break;
            }
        }
    }

    // /////////////////////////////////////////////////////////////////
    // Snapshot management
    // /////////////////////////////////////////////////////////////////
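    // Illustrative: an open snapshot pins batches via their snapshot
    // counters, which is why performBatches() and syncBatches() above stop
    // at any batch with snapshots != 0. A snapshot opened against the head
    // commit of the open batch reuses that commit's tracker, so concurrent
    // readers of the same revision share a single SnapshotTracker.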
    Snapshot openSnapshot() {
        synchronized (TRANSACTION_MUTEX) {

            // Re-use the last entry if it was a snapshot head..
            Commit commit = openBatch.getHeadCommit();
            SnapshotTracker tracker = null;
            if (commit != null) {
                if (commit.snapshotTracker == null) {
                    // So we can track the new snapshot...
                    commit.snapshotTracker = new SnapshotTracker(openBatch, commit);
                }
                tracker = commit.snapshotTracker;
            } else {
                tracker = new SnapshotTracker(openBatch, null);
            }

            // Open the snapshot.
            return new Snapshot(this, tracker, storedBatches, openBatch).open();
        }
    }

    // /////////////////////////////////////////////////////////////////
    // Helper methods
    // /////////////////////////////////////////////////////////////////

    private int storeObject(Object value) {
        try {
            ExtentOutputStream eos = new ExtentOutputStream(pageFile);
            ObjectOutputStream oos = new ObjectOutputStream(eos);
            oos.writeObject(value);
            oos.close();
            return eos.getPage();
        } catch (IOException e) {
            throw new IOPagingException(e);
        }
    }

    @SuppressWarnings("unchecked")
    private <T> T loadObject(int pageId) {
        try {
            ExtentInputStream eis = new ExtentInputStream(pageFile, pageId);
            ObjectInputStream ois = new ObjectInputStream(eis);
            return (T) ois.readObject();
        } catch (IOException e) {
            throw new IOPagingException(e);
        } catch (ClassNotFoundException e) {
            throw new IOPagingException(e);
        }
    }

    private void storeHeader() {
        trace("storing file header: %s", header);
        file.write(0, header.encode());
    }
}